{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from glob import glob\n",
    "from datetime import datetime\n",
    "\n",
    "DATA_PATH = \"./data/\"\n",
    "TRAIN_PATH = DATA_PATH + \"train.csv\"\n",
    "TEST_PATH = DATA_PATH + \"test.csv\"\n",
    "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n",
    "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n",
    "QUEST_PATH = DATA_PATH + \"question.csv\"\n",
    "\n",
    "train_data = pd.read_csv(TRAIN_PATH)\n",
    "test_data = pd.read_csv(TEST_PATH)\n",
    "question_data = pd.read_csv(QUEST_PATH)\n",
    "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n",
    "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n",
    "\n",
    "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n",
    "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n",
    "\n",
    "label = train_data[\"label\"].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "((10001, 300), (3049, 300))"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from keras.preprocessing.text import Tokenizer\n",
    "\n",
    "MAX_COUNT = 10000\n",
    "\n",
    "word_tokenizer = Tokenizer(MAX_COUNT)\n",
    "word_tokenizer.fit_on_texts(question_data[\"words\"])\n",
    "\n",
    "word_embedding_data = np.concatenate(\n",
    "    (\n",
    "        np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n",
    "        word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n",
    "    ),\n",
    "    axis=0\n",
    ")\n",
    "\n",
    "char_tokenizer = Tokenizer(MAX_COUNT)\n",
    "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n",
    "\n",
    "char_embedding_data = np.concatenate(\n",
    "    (\n",
    "        np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n",
    "        char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n",
    "    ),\n",
    "    axis=0\n",
    ")\n",
    "\n",
    "word_embedding_data.shape, char_embedding_data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((254386, 25),\n",
       " (254386, 25),\n",
       " (172956, 25),\n",
       " (172956, 25),\n",
       " (254386, 25),\n",
       " (254386, 25),\n",
       " (172956, 25),\n",
       " (172956, 25))"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from keras.preprocessing.sequence import pad_sequences\n",
    "\n",
    "SEQ_LEN = 25\n",
    "\n",
    "def gen_word_data(data):\n",
    "    seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n",
    "    seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n",
    "    return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n",
    "        pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n",
    "    \n",
    "def gen_char_data(data):\n",
    "    seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n",
    "    seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n",
    "    return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n",
    "        pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n",
    "\n",
    "word1, word2 = gen_word_data(train_data)\n",
    "char1, char2 = gen_char_data(train_data)\n",
    "test_word1, test_word2 = gen_word_data(test_data)\n",
    "test_char1, test_char2 = gen_char_data(test_data)\n",
    "\n",
    "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from keras.models import Model\n",
    "from keras.layers.merge import concatenate\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
    "from keras.optimizers import Adam, Nadam, SGD\n",
    "from keras.layers import LSTM, Bidirectional, TimeDistributed, CuDNNLSTM\n",
    "from keras.layers import Conv1D, GlobalMaxPool1D, GlobalAveragePooling1D, MaxPool1D\n",
    "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# general\n",
    "BATCH_SIZE = 512\n",
    "NUM_EPOCHES = 30\n",
    "DROP_RATE = 0.3\n",
    "PATIENCE = 8\n",
    "# cnn\n",
    "CONV_LEN_1 = 128\n",
    "CONV_LEN_2 = 128\n",
    "CONV_LEN_3 = 128\n",
    "CONV_LEN_4 = 128\n",
    "CONV_LEN_5 = 128\n",
    "CONV_LEN_6 = 128\n",
    "CONV_LEN = CONV_LEN_1 + CONV_LEN_2 + CONV_LEN_3 + CONV_LEN_4 + CONV_LEN_5 + CONV_LEN_6\n",
    "# lstm\n",
    "LSTM_SIZE1 = 256\n",
    "LSTM_SIZE2 = 256\n",
    "LSTM_DROP_RATE = 0.3\n",
    "# dense\n",
    "DENSE_SIZE1 = 512\n",
    "DENSE_SIZE2 = 256"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def cnn_layer1(step_input, filters, kernel_size):\n",
    "    conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n",
    "    conv_output = conv(step_input)\n",
    "    conv_output = GlobalMaxPool1D()(conv_output)\n",
    "    return conv_output\n",
    "\n",
    "def cnn_layer2(step_input, filters, kernel_size):\n",
    "    conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n",
    "    conv_output = conv(step_input)\n",
    "    conv_output = GlobalAveragePooling1D()(conv_output)\n",
    "    return conv_output\n",
    "\n",
    "def cnn_layer3(step_input, filters, kernel_size):\n",
    "    conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n",
    "    conv_output = conv(step_input)\n",
    "    conv_output1 = GlobalMaxPool1D()(conv_output)\n",
    "    conv_output2 = GlobalAveragePooling1D()(conv_output)\n",
    "    conv_output = concatenate([conv_output1, conv_output2])\n",
    "    return conv_output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From C:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\util\\deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "`NHWC` for data_format is deprecated, use `NWC` instead\n",
      "Train on 228946 samples, validate on 25440 samples\n",
      "Epoch 1/30\n",
      "228946/228946 [==============================] - 304s 1ms/step - loss: 0.3771 - acc: 0.8290 - val_loss: 0.2841 - val_acc: 0.8757\n",
      "Epoch 2/30\n",
      "228946/228946 [==============================] - 299s 1ms/step - loss: 0.2587 - acc: 0.8887 - val_loss: 0.2429 - val_acc: 0.8976\n",
      "Epoch 3/30\n",
      "228946/228946 [==============================] - 299s 1ms/step - loss: 0.2155 - acc: 0.9091 - val_loss: 0.2256 - val_acc: 0.9086\n",
      "Epoch 4/30\n",
      "228946/228946 [==============================] - 297s 1ms/step - loss: 0.1888 - acc: 0.9217 - val_loss: 0.2054 - val_acc: 0.9134\n",
      "Epoch 5/30\n",
      "228946/228946 [==============================] - 296s 1ms/step - loss: 0.1684 - acc: 0.9307 - val_loss: 0.2144 - val_acc: 0.9143\n",
      "Epoch 6/30\n",
      "228946/228946 [==============================] - 295s 1ms/step - loss: 0.1524 - acc: 0.9382 - val_loss: 0.2035 - val_acc: 0.9181\n",
      "Epoch 7/30\n",
      "228946/228946 [==============================] - 296s 1ms/step - loss: 0.1361 - acc: 0.9441 - val_loss: 0.2061 - val_acc: 0.9204\n",
      "Epoch 8/30\n",
      "228946/228946 [==============================] - 297s 1ms/step - loss: 0.1254 - acc: 0.9490 - val_loss: 0.2181 - val_acc: 0.9162\n",
      "Epoch 9/30\n",
      " 70144/228946 [========>.....................] - ETA: 3:17 - loss: 0.1033 - acc: 0.9583"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import StratifiedKFold\n",
    "\n",
    "best_results = []\n",
    "last_results = []\n",
    "\n",
    "for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10, shuffle=True).split(X=word1, y=label)):  # word/char switch\n",
    "    train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index]  # word/char switch\n",
    "    dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index]  # word/char switch\n",
    "    \n",
    "    input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n",
    "    input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n",
    "\n",
    "    embedding_layer = Embedding(\n",
    "        input_dim=word_embedding_data.shape[0],  # word/char switch\n",
    "        output_dim=word_embedding_data.shape[1],  # word/char switch\n",
    "        weights=[word_embedding_data],  # word/char switch\n",
    "        input_length=SEQ_LEN,\n",
    "        trainable=False\n",
    "    )\n",
    "    \n",
    "    vector1 = embedding_layer(input1)\n",
    "    vector2 = embedding_layer(input2)\n",
    "    \n",
    "    lstm_layer1 = Bidirectional(LSTM(LSTM_SIZE1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True))\n",
    "    layer1a = lstm_layer1(vector1)\n",
    "    layer1b = lstm_layer1(vector2)\n",
    "    lstm_layer2 = Bidirectional(LSTM(LSTM_SIZE2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True))\n",
    "    layer2a = lstm_layer2(layer1a)\n",
    "    layer2b = lstm_layer2(layer1b)\n",
    "    layer2a = concatenate([vector1, layer2a])\n",
    "    layer2b = concatenate([vector2, layer2b])\n",
    "    \n",
    "    # TODO: 这里还可以添加BatchNorm层\n",
    "    \n",
    "    conv1a, conv1b = cnn_layer1(layer2a, filters=CONV_LEN_1, kernel_size=1), cnn_layer1(layer2b, filters=CONV_LEN_1, kernel_size=1)\n",
    "    conv2a, conv2b = cnn_layer1(layer2a, filters=CONV_LEN_2, kernel_size=2), cnn_layer1(layer2b, filters=CONV_LEN_2, kernel_size=2)\n",
    "    conv3a, conv3b = cnn_layer1(layer2a, filters=CONV_LEN_3, kernel_size=3), cnn_layer1(layer2b, filters=CONV_LEN_3, kernel_size=3)\n",
    "    conv4a, conv4b = cnn_layer1(layer2a, filters=CONV_LEN_4, kernel_size=4), cnn_layer1(layer2b, filters=CONV_LEN_4, kernel_size=4)\n",
    "    conv5a, conv5b = cnn_layer1(layer2a, filters=CONV_LEN_5, kernel_size=5), cnn_layer1(layer2b, filters=CONV_LEN_5, kernel_size=5)\n",
    "    conv6a, conv6b = cnn_layer1(layer2a, filters=CONV_LEN_6, kernel_size=6), cnn_layer1(layer2b, filters=CONV_LEN_6, kernel_size=6)\n",
    "    \n",
    "    merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n",
    "    merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n",
    "    diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n",
    "    mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n",
    "    merge = concatenate([diff, mult])\n",
    "    \n",
    "    x = Dropout(DROP_RATE)(merge)\n",
    "    x = BatchNormalization()(x)\n",
    "    x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n",
    "    x = Dropout(DROP_RATE)(x)\n",
    "    x = BatchNormalization()(x)\n",
    "    x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n",
    "    x = Dropout(DROP_RATE)(x)\n",
    "    x = BatchNormalization()(x)\n",
    "    pred = Dense(1, activation=\"sigmoid\")(x)\n",
    "    \n",
    "    model = Model(inputs=[input1, input2], outputs=pred)\n",
    "    model.compile(\n",
    "        optimizer=\"nadam\",\n",
    "        loss=\"binary_crossentropy\",\n",
    "        metrics=[\"acc\"]\n",
    "    )\n",
    "\n",
    "    early_stopping = EarlyStopping(\"val_loss\", patience=PATIENCE)\n",
    "    check_point = ModelCheckpoint(\n",
    "        \"./log/%s.TextRCNN.word.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),  # word/char switch\n",
    "        monitor=\"val_loss\",\n",
    "        save_best_only=True,\n",
    "    )\n",
    "    \n",
    "    fit_res = model.fit(\n",
    "        x=[train_x1, train_x2],\n",
    "        y=train_y,\n",
    "        batch_size=BATCH_SIZE,\n",
    "        epochs=NUM_EPOCHES,\n",
    "        validation_data=([dev_x1, dev_x2], dev_y),\n",
    "        shuffle=True,\n",
    "        callbacks=[early_stopping, check_point]\n",
    "    )\n",
    "    \n",
    "    pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)  # word/char switch\n",
    "    last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n",
    "    \n",
    "    print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n",
    "    model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n",
    "    pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)  # word/char switch\n",
    "    best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n",
    "\n",
    "pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n",
    "    \"./result/%s-TextRCNN_word_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),  # word/char switch\n",
    "    index=False\n",
    ")\n",
    "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n",
    "    \"./result/%s-TextRCNN_word_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),  # word/char switch\n",
    "    index=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
